其他
如何使用Python玩转PDF各种骚操作?
The following article is from Python数据科学 Author wLsq
↑↑↑点击上方“蓝字”,关注“极客猴”
如果你喜欢极客猴,可以把我置顶或加为星标
阅读本文大概需要 6 分钟。
Portable Document Format(可移植文档格式),简称PDF,是一种文件格式,可用于跨操作系统呈现和交换文档。尽管PDF最初是由Adobe发明的,但它现在是由国际标准化组织(ISO)维护的开放标准。你可以使用PyPDF2包在Python中处理已存在的PDF。
用Python提取PDF中的文档信息
旋转页面
合并PDF
拆分PDF
添加水印
加密PDF
$ pip install pypdf2
Author
Creator
Producer
Subject
Title
Number of pages
from PyPDF2 import PdfFileReader
def extract_information(pdf_path):
    """Print a summary of a PDF's document metadata and return that metadata.

    Args:
        pdf_path: Path of the PDF file to inspect; it is opened in binary
            mode.

    Returns:
        Whatever ``PdfFileReader.getDocumentInfo()`` returned for the file.
    """
    with open(pdf_path, 'rb') as pdf_file:
        reader = PdfFileReader(pdf_file)
        metadata = reader.getDocumentInfo()
        page_count = reader.getNumPages()
        # The report is assembled while the file handle is still open.
        # Its lines start at column 0 so the printed text carries no
        # leading indentation.
        report = f"""
Information about {pdf_path}:
Author: {metadata.author}
Creator: {metadata.creator}
Producer: {metadata.producer}
Subject: {metadata.subject}
Title: {metadata.title}
Number of pages: {page_count}
"""
    print(report)
    return metadata
if __name__ == "__main__":
    # Placeholder file name; point it at a real PDF before running.
    path = "xxxx.pdf"
    extract_information(path)
from PyPDF2 import PdfFileReader, PdfFileWriter
def rotate_pages(pdf_path):
    """Write ``rotate_pages.pdf`` built from the first three pages of a PDF.

    The first page is rotated 90 degrees clockwise, the second page 90
    degrees counter-clockwise, and the third page is copied unchanged.

    Args:
        pdf_path: Path of the source PDF. Page indexes 0, 1 and 2 are read,
            so the document needs at least three pages.
    """
    pdf_writer = PdfFileWriter()
    # Read from the function argument rather than a module-level variable,
    # so the function also works when imported and called with any path.
    pdf_reader = PdfFileReader(pdf_path)

    # Rotate the first page 90 degrees clockwise.
    page_1 = pdf_reader.getPage(0).rotateClockwise(90)
    pdf_writer.addPage(page_1)

    # Rotate the second page 90 degrees counter-clockwise.
    page_2 = pdf_reader.getPage(1).rotateCounterClockwise(90)
    pdf_writer.addPage(page_2)

    # Add the third page in its normal orientation.
    pdf_writer.addPage(pdf_reader.getPage(2))

    with open('rotate_pages.pdf', 'wb') as fh:
        pdf_writer.write(fh)
if __name__ == "__main__":
    # Placeholder file name; point it at a real PDF before running.
    path = "新路径.pdf"
    rotate_pages(path)
from PyPDF2 import PdfFileReader, PdfFileWriter
def merge_pdfs(paths, output):
    """Concatenate several PDFs, in order, into a single output file.

    Args:
        paths: Iterable of source PDF paths; pages are appended in the order
            the paths are given.
        output: Path of the merged PDF to write.
    """
    merged = PdfFileWriter()

    for source_path in paths:
        reader = PdfFileReader(source_path)
        page_total = reader.getNumPages()
        for index in range(page_total):
            # Append each page of the current source to the writer.
            source_page = reader.getPage(index)
            merged.addPage(source_page)

    # Write out the merged PDF.
    with open(output, 'wb') as merged_file:
        merged.write(merged_file)
if __name__ == "__main__":
    # Example inputs; both files must exist for the merge to run.
    paths = ["document1.pdf", "document2.pdf"]
    merge_pdfs(paths, output="merged.pdf")
from PyPDF2 import PdfFileReader, PdfFileWriter
def split(path, name_of_split):
    """Write every page of a PDF to its own single-page file.

    Output files are named ``{name_of_split}{index}.pdf``, where ``index``
    is the zero-based page index.

    Args:
        path: Path of the PDF to split.
        name_of_split: Prefix used for each generated file name.
    """
    source = PdfFileReader(path)
    page_total = source.getNumPages()

    for index in range(page_total):
        # A fresh writer per page keeps each output file single-page.
        single_page_writer = PdfFileWriter()
        single_page_writer.addPage(source.getPage(index))

        target_name = f'{name_of_split}{index}.pdf'
        with open(target_name, 'wb') as target_file:
            single_page_writer.write(target_file)
if __name__ == "__main__":
    # Placeholder file name; point it at a real PDF before running.
    path = "xxx.pdf"
    split(path, "jupyter_page")
from PyPDF2 import PdfFileWriter, PdfFileReader
def create_watermark(input_pdf, output, watermark):
    """Merge a watermark page onto every page of a PDF and save the result.

    Args:
        input_pdf: Path of the PDF to be watermarked.
        output: Path where the watermarked PDF is written.
        watermark: Path of a PDF whose first page is used as the watermark.
    """
    # Only the first page of the watermark document is used.
    stamp = PdfFileReader(watermark).getPage(0)

    source = PdfFileReader(input_pdf)
    result = PdfFileWriter()

    # Apply the watermark to every page of the input.
    for index in range(source.getNumPages()):
        current_page = source.getPage(index)
        current_page.mergePage(stamp)
        result.addPage(current_page)

    with open(output, 'wb') as output_file:
        result.write(output_file)
if __name__ == "__main__":
    # Run the watermark example when the file is executed as a script.
    create_watermark(
        input_pdf="Jupyter_Notebook_An_Introduction.pdf",
        output="watermarked_notebook.pdf",
        watermark="watermark.pdf",
    )
input_pdf:要加水印的PDF文件路径
output:保存带水印PDF的路径
watermark:包含水印图像或文本的PDF
from PyPDF2 import PdfFileWriter, PdfFileReader
def add_encryption(input_pdf, output_pdf, password):
    """Copy a PDF into a new file protected by a user password.

    Args:
        input_pdf: Path of the PDF to copy.
        output_pdf: Path where the encrypted copy is written.
        password: Passed to ``PdfFileWriter.encrypt`` as ``user_pwd``;
            ``owner_pwd`` is passed as ``None``.
    """
    source = PdfFileReader(input_pdf)
    encrypted = PdfFileWriter()

    # Copy every page across before applying encryption to the writer.
    page_total = source.getNumPages()
    for index in range(page_total):
        encrypted.addPage(source.getPage(index))

    encrypted.encrypt(
        user_pwd=password,
        owner_pwd=None,
        use_128bit=True,
    )

    with open(output_pdf, 'wb') as encrypted_file:
        encrypted.write(encrypted_file)
if __name__ == "__main__":
    # Run the encryption example when the file is executed as a script.
    add_encryption(
        input_pdf="reportlab-sample.pdf",
        output_pdf="reportlab-encrypted.pdf",
        password="twofish",
    )
计算机学生如何规划好大学四年的学习?
如何将 Pycharm 打造得更称手
技术·思考·职场
长按二维码,添加关注!